#!/usr/bin/env python3

# Copyright (c) 2023, Arm Limited and Contributors. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

from git import Repo
import lxml.html
import os
import re


def _git_to_web_url(url):
    """
    Convert a git HTTPS or SSH URL to a GitLab web URL.

    Accepted forms:

    * HTTP(S) URLs, e.g. "https://host/group/project.git"
    * scp-like SSH URLs, e.g. "git@host:group/project.git"
    * scheme-style SSH URLs, e.g. "ssh://git@host:2222/group/project.git"

    The returned URL always uses HTTP(S), carries no "user[:password]@" part and no
    SSH port, and has any trailing ".git" removed.

    Raises Exception if the URL matches none of the forms above.
    """
    if re.match(r"^https?://", url):
        # Drop embedded credentials (e.g. "user:token@") so that they never end up in
        # links written into the generated website.
        url = re.sub(r"^(https?://)[^/@]*@", r"\1", url)
    else:
        # Scheme-style SSH URL: ssh://[user@]host[:port]/path. This has to be checked
        # before the scp-like form, because splitting it on "@" and ":" can also yield
        # exactly three parts and would silently produce a bogus web URL.
        ssh_scheme = re.match(
            r"^ssh://(?:[^@/]+@)?(?P<host>[^:/]+)(?::\d+)?/(?P<path>.+)$", url
        )
        if ssh_scheme:
            url = f"https://{ssh_scheme['host']}/{ssh_scheme['path']}"
        else:
            # Attempt to convert scp-like SSH (user@host:path) to HTTPS
            ssh_parts = re.split(r"[@:]", url)
            if len(ssh_parts) != 3:
                raise Exception(f"Not a valid HTTPS or SSH URL: {url}")
            url = f"https://{ssh_parts[1]}/{ssh_parts[2]}"

    # Trim off any ".git" at the end
    return re.sub(r"\.git$", "", url)


def _get_base_url():
    """
    Build the GitLab web URL prefix ("<project>/-/tree/<sha>") for file links.

    The project URL and commit SHA are taken from the predefined GitLab CI variables
    CI_PROJECT_URL and CI_COMMIT_SHA when both are set, and from the git repository
    in the current directory otherwise.
    """
    environment = os.environ
    web_url = environment.get("CI_PROJECT_URL")
    sha = environment.get("CI_COMMIT_SHA")

    if not (web_url and sha):
        # At least one CI variable is missing (e.g. MkDocs is running on a local
        # machine), so both values are read from the local git repository instead.
        # Limitation: only the first URL of the "origin" remote is considered.
        local_repo = Repo(os.curdir)
        sha = local_repo.head.commit.hexsha
        web_url = _git_to_web_url(next(local_repo.remotes.origin.urls))

    return f"{web_url}/-/tree/{sha}"


def on_page_read_source(page, **kwargs):
    """
    MkDocs hook called when the raw markdown source of a page is read.

    Records in the custom attribute page.untitled whether the page has no title at
    this point. Nothing is returned.
    """
    # When this hook runs, page.title holds the optional title from mkdocs.yml and may
    # therefore still be empty. Later on, MkDocs fills an empty title with
    #
    # * the top heading of the markdown file, if there is one, or otherwise
    # * the file name, which makes for a poor title.
    #
    # For a markdown file that inlines another markdown file, MkDocs cannot see the
    # heading of the inlined file and ends up with the file name. Remembering here
    # that the page started out untitled lets on_page_content() replace that title
    # with the top HTML heading afterwards.
    has_title = bool(page.title)
    page.untitled = not has_title


def on_page_markdown(markdown, page, config, **kwargs):
    """
    MkDocs hook for preprocessing an input markdown page.

    Looks for the first single line snippet notation in the markdown and stores the
    quoted file path in page.inlined_page, or None if there is no such notation.
    The markdown itself is returned unchanged.
    """
    # Match the single line format snippet notation, i.e. '--8<-- "filename.ext"'.
    # The path is captured up to the first closing quote only; a greedy match would
    # swallow everything up to the last double quote on the same line.
    result = re.search(r'-+8<-+\s*"(?P<inlined_page>[^"\n]+)"', markdown)

    # If the page inlines another page, record the inlined page's path.
    # Note: MkDocs' "page" object originally contains no "inlined_page" attribute.
    # This is added by our plugin.
    page.inlined_page = result["inlined_page"] if result else None
    return markdown


def on_page_content(html, page, **kwargs):
    """
    MkDocs hook for postprocessing an output HTML page.

    Every link in the page is passed through rewrite_links_callback():

    * HTTP(S) links, links containing "@" and same-page anchors are kept as they are.
    * Links into docs/ are kept, except that in inlined pages they are made relative
      to docs/ and their ".md" suffix is replaced by ".html".
    * Any other link is turned into a GitLab repository link below the base URL.

    For an originally untitled page with inlined content, the page title is also set
    to the text of the first h1 heading, if there is one.

    Returns the rewritten HTML as a string. Raises Exception if a link points above
    the root directory or to a file that does not exist.
    """
    # lxml cannot parse an empty document, and an empty page has no links to rewrite.
    if not html or not html.strip():
        return html

    base_url = _get_base_url()
    page_path = page.inlined_page if page.inlined_page else f"docs/{page.url}"
    page_dir = os.path.dirname(page_path)
    html_obj = lxml.html.document_fromstring(html)

    # If the page's content is inlined *and* no title is given in mkdocs.yml, MkDocs
    # falls back to using the file name as the title which is not ideal. To improve
    # this, we set the first h1 heading as the page title. A page without a usable h1
    # heading simply keeps the title it already has.
    if page.inlined_page and page.untitled:
        top_heading = html_obj.find(".//h1")
        if top_heading is not None and top_heading.text:
            page.title = top_heading.text

    def rewrite_links_callback(link):
        # HTTPS, SSH (@), mailto (@) and same-page anchor (#) links need no conversion
        if re.match(r"(^https?://|.*@|^#)", link):
            return link

        # A "#fragment" is not part of the file path: keep it aside so that it neither
        # hides the ".md" suffix nor breaks the file existence check below.
        path, hash_sign, fragment = link.partition("#")
        anchor = hash_sign + fragment

        # Normalize the link to be relative to the root of the repository instead of the
        # current page.
        norm_link = os.path.normpath(os.path.join(page_dir, path))
        parts = norm_link.split(os.sep)
        if parts[0] == "docs":
            # The entire contents of docs/ (including non-documentation files) are
            # always added by MkDocs into the website folder, so they should not be
            # converted to GitLab repository links.
            if page.inlined_page:
                # MkDocs only maps links from .md to .html in non-inlined pages, so we
                # need to handle links in inlined pages by ourselves. Also, we need to
                # strip off the leading docs/ component. This is done by dropping the
                # first path component; str.lstrip("docs/") would strip a *set of
                # characters* and also eat into names such as "source".
                site_path = "/".join(parts[1:])
                link = re.sub(r"\.md$", ".html", site_path) + anchor
            return link

        if parts[0] == os.pardir:
            raise Exception(
                f"Unable to convert {link}: cannot be above the root directory"
            )

        if not os.path.exists(norm_link):
            raise Exception(
                f"The link {link} in {page_path} points to non-existent {norm_link}"
            )

        # URLs always use forward slashes, whatever the local path separator is.
        repo_path = "/".join(parts)
        return f"{base_url}/{repo_path}{anchor}"

    html_obj.rewrite_links(rewrite_links_callback)
    return lxml.html.tostring(html_obj).decode()
